This is a walkthrough on how to recreate the hematopoiesis visualizations from Figure 2 of our Cell Systems paper.
Load the required libraries and data. The hematopoiesis data (from Paul et al) can be downloaded here.
library(Seurat)
## Loading required package: ggplot2
## Loading required package: cowplot
##
## Attaching package: 'cowplot'
## The following object is masked from 'package:ggplot2':
##
## ggsave
## Loading required package: Matrix
library(swne)
##
## Attaching package: 'swne'
## The following object is masked from 'package:Seurat':
##
## ExtractField
library(monocle)
## Loading required package: Biobase
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:Matrix':
##
## colMeans, colSums, rowMeans, rowSums, which
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, basename, cbind,
## colMeans, colnames, colSums, dirname, do.call, duplicated,
## eval, evalq, Filter, Find, get, grep, grepl, intersect,
## is.unsorted, lapply, lengths, Map, mapply, match, mget, order,
## paste, pmax, pmax.int, pmin, pmin.int, Position, rank, rbind,
## Reduce, rowMeans, rownames, rowSums, sapply, setdiff, sort,
## table, tapply, union, unique, unsplit, which, which.max,
## which.min
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
## Loading required package: VGAM
## Loading required package: stats4
## Loading required package: splines
## Loading required package: DDRTree
## Loading required package: irlba
library(ggplot2)
## Load monocle2 object
load("~/swne/Data/hemato_data/hemato_monocle_orig_cds.RData")
## Load counts and informative genes
counts <- ReadData("~/swne/Data/hemato_data/hemato_expr_debatched.tsv", make.sparse = T)
info.genes <- scan("~/swne/Data/hemato_data/hemato_info_genes.txt", sep = "\n", what = character())
## Load clusters
clusters.df <- read.table("~/swne/Data/hemato_data/hemato_cluster_mapping.csv", sep = ",")
clusters <- clusters.df[[2]]; names(clusters) <- clusters.df[[1]]
counts <- counts[,names(clusters)]
Filter genes and cells and make Seurat object
counts <- FilterData(counts, min.samples.frac = 0.005, trim = 0.005, min.nonzero.features = 200)
info.genes <- info.genes[info.genes %in% rownames(counts)]
dim(counts)
## [1] 8653 2730
se.obj <- CreateSeuratObject(counts)
Set cluster names, exclude lymphoid cells and dendritic cells (which are not part of the developmental trajectory), set cluster colors.
se.obj <- SetIdent(se.obj, cells.use = names(clusters), clusters)
se.obj@ident <- plyr::revalue(se.obj@ident,
c("1" = 'Ery', "2" = 'Ery', "3" = 'Ery', "4" = 'Ery', "5" = 'Ery',
"6" = 'Ery', "7" = 'MP/EP', "8" = 'MK', "9" = 'GMP', "10" = 'GMP',
"11" = 'DC', "12" = 'Bas', "13" = 'Bas', "14" = 'M', "15" = 'M',
"16" = 'Neu', "17" = 'Neu', "18" = 'Eos', "19" = 'lymphoid'))
se.obj <- SubsetData(se.obj, ident.remove = c("lymphoid", "DC"))
clusters <- se.obj@ident; names(clusters) <- se.obj@cell.names;
cluster_colors <- c("Bas" = "#ff6347", "Eos" = "#EFAD1E",
"Ery" = "#8CB3DF", "M" = "#53C0AD", "MP/EP" = "#4EB859",
"GMP" = "#D097C4", "MK" = "#ACC436", "Neu" = "#F5918A")
Scale data, run PCA, t-SNE, and UMAP, and build the SNN.
## Scale data
se.obj@data <- ScaleCounts(se.obj@raw.data[,se.obj@cell.names], method = "log")
## calculating variance fit ... using gam
se.obj@scale.data <- se.obj@data - Matrix::rowMeans(se.obj@data)
## Run PCA
se.obj <- RunPCA(se.obj, pc.genes = info.genes, do.print = F, pcs.compute = 40)
PCElbowPlot(se.obj, num.pc = 40)
## Run t-SNE, UMAP, and SNN
se.obj <- RunTSNE(se.obj, dims.use = 1:15)
se.obj <- RunUMAP(se.obj, reduction.use = "pca", dims.use = 1:15)
se.obj <- BuildSNN(se.obj, dims.use = 1:15, k.param = 40, prune.SNN = 0.0, force.recalc = T)
Identify number of factors to use for SWNE
k.range <- seq(2,20,2) ## Range of factor values to test
n.cores <- 16 ## Number of cores to use
norm.counts <- se.obj@data[,names(clusters)]
k.err <- FindNumFactors(norm.counts[info.genes,], k.range = k.range, n.cores = n.cores,
seed = 32590, do.plot = F)
PlotFactorSelection(k.err, font.size = 15)
Run SWNE
k <- 12
nmf.res <- RunNMF(norm.counts[info.genes,], k = k, init = "ica", n.cores = n.cores, ica.fast = F) ## Run NMF
nmf.scores <- nmf.res$H
nmf.res$W <- ProjectFeatures(norm.counts, nmf.res$H, n.cores = n.cores) ## Project all genes onto NMF
## Run SWNE embedding
swne.embedding <- EmbedSWNE(nmf.scores, SNN = se.obj@snn, alpha.exp = 1.5, snn.exp = 0.1, n_pull = 3,
dist.use = "cosine")
## Initial stress : 0.09063
## stress after 10 iters: 0.03130, magic = 0.461
## stress after 20 iters: 0.02789, magic = 0.500
## stress after 30 iters: 0.02786, magic = 0.500
Identify key factors and embed key genes. See Table S1 from the paper for how these factors were named and key genes selected.
swne.embedding <- RenameFactors(swne.embedding, name.mapping = c("factor_4" = "Erythrocyte differentiation",
"factor_5" = "Metal binding",
"factor_9" = "Epigenetic regulation",
"factor_10" = "HSC maintenance",
"factor_11" = "Inflammation"))
genes.embed <- c("Apoe", "Mt2", "Gpr56", "Sun2", "Flt3")
swne.embedding <- EmbedFeatures(swne.embedding, nmf.res$W, genes.embed, n_pull = 3)
Make SWNE plot
PlotSWNE(swne.embedding, alpha.plot = 0.4, sample.groups = clusters, do.label = T,
label.size = 3.5, pt.size = 2, show.legend = F) +
scale_color_manual(values = cluster_colors)
## Scale for 'colour' is already present. Adding another scale for
## 'colour', which will replace the existing scale.
Make t-SNE plot
## tSNE plots
tsne.scores <- GetCellEmbeddings(se.obj, reduction.type = "tsne")
PlotDims(tsne.scores, sample.groups = clusters, show.legend = F, show.axes = F,
alpha.plot = 0.5, label.size = 3.5, pt.size = 1.5) +
scale_color_manual(values = cluster_colors)
## Scale for 'colour' is already present. Adding another scale for
## 'colour', which will replace the existing scale.
Make UMAP plot
umap.scores <- GetCellEmbeddings(se.obj, reduction.type = "umap")
PlotDims(umap.scores, sample.groups = clusters, show.legend = F, show.axes = F,
alpha.plot = 0.5, label.size = 3.5, pt.size = 1.5) +
scale_color_manual(values = cluster_colors)
## Scale for 'colour' is already present. Adding another scale for
## 'colour', which will replace the existing scale.
Extract pre-computed pseudotime (computed using Monocle2)
pseudotime <- cds$Pseudotime; names(pseudotime) <- colnames(cds@reducedDimS);
pseudotime <- pseudotime[names(clusters)]
SWNE pseudotime plot
FeaturePlotSWNE(swne.embedding, pseudotime, alpha.plot = 0.4, label.size = 3.5, pt.size = 1.5)
t-SNE pseudotime plot
FeaturePlotDims(tsne.scores, pseudotime, alpha.plot = 0.4, show.axes = F, pt.size = 1.5)
UMAP pseudotime plot
FeaturePlotDims(umap.scores, pseudotime, alpha.plot = 0.4, show.axes = F, pt.size = 1.5)
Now let’s compute a quantitative evaluation of SWNE, tSNE, and UMAP as well as other embeddings.
First we need to compute those other embeddings, including dmaps, MDS, Isomap, LLE, and PCA
library(destiny)
dm <- DiffusionMap(t(as.matrix(norm.counts[info.genes,])), k = 20, n_eigs = 2)
diff.scores <- dm@eigenvectors; rownames(diff.scores) <- colnames(norm.counts);
library(RDRToolbox)
## Warning in rgl.init(initValue, onlyNULL): RGL: unable to open X11 display
## Warning: 'rgl_init' failed, running with rgl.useNULL = TRUE
isomap.scores <- Isomap(t(as.matrix(norm.counts[info.genes,])), dims = 2, k = 20)$dim2
## Computing distance matrix ... done
## Building graph with shortest paths (using 20 nearest neighbours) ... done
## Computing low dimensional embedding ... done
## number of samples: 2669
## reduction from 2978 to 2 dimensions
## number of connected components in graph: 1
rownames(isomap.scores) <- colnames(norm.counts)
lle.scores <- LLE(t(as.matrix(norm.counts[info.genes,])), dim = 2, k = 20)
## Computing distance matrix ... done
## Computing low dimensional emmbedding (using 20 nearest neighbours)... done
rownames(lle.scores) <- colnames(norm.counts)
mds.scores <- cmdscale(dist(t(as.matrix(norm.counts[info.genes,]))), k = 2)
pc.scores <- GetCellEmbeddings(se.obj, dims.use = 1:2)
embeddings <- list(swne = t(as.matrix(swne.embedding$sample.coords)),
tsne = t(tsne.scores),
pca = t(pc.scores),
lle = t(lle.scores),
mds = t(mds.scores),
isomap = t(isomap.scores),
dmaps = t(diff.scores),
umap = t(umap.scores))
Next we define some helper functions that will help us evaluate these embeddings
library(FNN)
library(proxy)
##
## Attaching package: 'proxy'
## The following object is masked from 'package:destiny':
##
## as.matrix
## The following object is masked from 'package:Matrix':
##
## as.matrix
## The following objects are masked from 'package:stats':
##
## as.dist, dist
## The following object is masked from 'package:base':
##
## as.matrix
## Calculate approximate kNN for an embedding
ComputeKNN <- function(emb, k) {
knn.idx <- knn.index(t(emb), k = k)
knn.matrix <- matrix(0, ncol(emb), ncol(emb))
for (i in 1:nrow(knn.idx)) {
knn.matrix[knn.idx[i,],i] <- 1
knn.matrix[i, knn.idx[i,]] <- 1
}
rownames(knn.matrix) <- colnames(knn.matrix) <- colnames(emb)
as(knn.matrix, "dgCMatrix")
}
## Calculate Jaccard similarities
CalcJaccard <- function(x,y) {
a <- sum(x)
b <- sum(y)
c <- sum(x == 1 & y == 1)
c/(a + b - c)
}
## Function for identifying cells in the same path and time step
GetPathStep <- function(metadata, step.size, make.factor = T) {
path.step <- as.character(metadata$Group); names(path.step) <- rownames(metadata);
for (path in levels(factor(metadata$Group))) {
steps <- sort(unique(subset(metadata, Group == path)$Step))
step.range <- seq(min(steps), max(steps), step.size)
for(i in step.range) {
cells.step <- rownames(subset(metadata, Group == path & Step %in% seq(i, i + step.size - 1, 1)))
path.step[cells.step] <- paste(path, i, sep = ".")
}
}
if (make.factor) {
path.step <- factor(path.step)
}
path.step
}
## Calculate pairwise distances between centroids
CalcPairwiseDist <- function(data.use, clusters, dist.method = "euclidean") {
data.centroids <- t(apply(data.use, 1, function(x) tapply(x, clusters, mean)))
return(proxy::dist(data.centroids, method = dist.method, by_rows = F))
}
Compute how well each embedding maintains local structure compared to the original gene expression space by comparing kNN networks.
n.neighbors <- 40
ref.knn <- ComputeKNN(norm.counts[info.genes,], k = n.neighbors)
## Compute kNN for embeddings
embeddings.knn <- lapply(embeddings, ComputeKNN, k = n.neighbors)
knn.simil <- sapply(embeddings.knn, function(knn.emb) {
mean(sapply(1:ncol(knn.emb), function(i) CalcJaccard(knn.emb[,i], ref.knn[,i])))
})
Compute how well each embedding maintains global strucutre by computing the centroids of each trajectory-cluster grouping in the embedding space and original gene expression space and correlating the pairwise distances between the centroids.
metadata.df <- data.frame(Group = clusters, Step = order(pseudotime[names(clusters)]))
clusters.steps <- GetPathStep(metadata.df, step.size = 50, make.factor = T)
traj.dist <- CalcPairwiseDist(norm.counts[info.genes,], clusters.steps)
embeddings.cor <- sapply(embeddings, function(emb) {
emb.dist <- CalcPairwiseDist(emb, clusters.steps)
cor(traj.dist, emb.dist)
})
Plot how well each embedding maintaings global vs local structure.
library(ggplot2)
library(ggrepel)
scatter.df <- data.frame(x = knn.simil, y = embeddings.cor, name = names(embeddings))
ggplot(scatter.df, aes(x, y)) + geom_point(size = 2, alpha = 1) +
theme_classic() + theme(legend.position = "none", text = element_text(size = 16)) +
xlab("Neighborhood Score") + ylab("Path-Time Distance Correlation") +
geom_text_repel(aes(x, y, label = name), size = 5) +
xlim(0, max(knn.simil)) + ylim(0, max(embeddings.cor))